Capítulo I

Importar datos

# Read the commute survey CSV that ships with the discrtr package and keep
# it as `mc_mode_choice`.
# mustWork = TRUE makes a missing package/file fail right here with a clear
# error instead of passing an empty path ("") on to read_csv().
mc_mode_choice <- read_csv(
  system.file("extdata", "mc_commute.csv", package = "discrtr", mustWork = TRUE),
  show_col_types = FALSE
)
# Preview the first rows of the imported table.
head(mc_mode_choice)
## # A tibble: 6 × 39
##   RespondentID choice avcycle avwalk avhsr avcar timecycle timewalk accesshsr
##          <dbl>  <dbl>   <dbl>  <dbl> <dbl> <dbl>     <dbl>    <dbl>     <dbl>
## 1    566872636      3       0      1     1     0      6.21     21.3      3   
## 2    566873140      3       0      1     1     1      3.73     12.8      4   
## 3    566874266      3       0      0     1     1 100000    100000        3   
## 4    566874842      2       1      1     1     0      5.83     20       10.7 
## 5    566881170      2       1      1     1     0      5.83     20        6.21
## 6    566907438      2       0      1     1     0 100000        10        7.14
## # ℹ 30 more variables: waitingtimehsr <dbl>, transfer <dbl>, timehsr <dbl>,
## #   timecar <dbl>, parking <dbl>, vehind <dbl>, owncycle <dbl>, gender <dbl>,
## #   work <dbl>, visa <dbl>, age <dbl>, solo <dbl>, shared <dbl>, family <dbl>,
## #   child <dbl>, primary_caregiver <dbl>, LAT <dbl>, LONG <dbl>, DAUID <dbl>,
## #   mhi <dbl>, dwell_den <dbl>, lum <dbl>, st_den <dbl>, inter_den <dbl>,
## #   SF_P_ratio <dbl>, side_den <dbl>, Shelters_SD <dbl>, Shelters_D <dbl>,
## #   Shelters_A <dbl>, Shelters_SA <dbl>

Glimpse

# Compact column-by-column overview (name, type, first values) of the table.
mc_mode_choice |> glimpse()
## Rows: 1,376
## Columns: 39
## $ RespondentID      <dbl> 566872636, 566873140, 566874266, 566874842, 56688117…
## $ choice            <dbl> 3, 3, 3, 2, 2, 2, 2, 3, 3, 2, 2, 4, 2, 2, 3, 2, 4, 3…
## $ avcycle           <dbl> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ avwalk            <dbl> 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1…
## $ avhsr             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ avcar             <dbl> 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0…
## $ timecycle         <dbl> 6.211180e+00, 3.726708e+00, 1.000000e+05, 5.828157e+…
## $ timewalk          <dbl> 21.31439, 12.78863, 100000.00000, 20.00000, 20.00000…
## $ accesshsr         <dbl> 3.00, 4.00, 3.00, 10.66, 6.21, 7.14, 10.66, 15.00, 2…
## $ waitingtimehsr    <dbl> 15.00, 15.00, 2.00, 10.23, 10.23, 10.23, 10.23, 3.00…
## $ transfer          <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 1e+…
## $ timehsr           <dbl> 5, 10, 15, 8, 5, 3, 20, 25, 8, 5, 5, 25, 2, 3, 25, 5…
## $ timecar           <dbl> 100000, 2, 4, 100000, 100000, 100000, 5, 17, 4, 1000…
## $ parking           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0…
## $ vehind            <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ owncycle          <dbl> 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0…
## $ gender            <dbl> 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ work              <dbl> 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0…
## $ visa              <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ age               <dbl> 21, 23, 20, 20, 19, 19, 49, 19, 20, 23, 25, 38, 20, …
## $ solo              <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ shared            <dbl> 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0…
## $ family            <dbl> 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1…
## $ child             <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1…
## $ primary_caregiver <dbl> 1e+05, 1e+05, 0e+00, 1e+05, 1e+05, 1e+05, 1e+00, 0e+…
## $ LAT               <dbl> 43.26302, 43.25885, 43.25222, 43.25782, 43.25562, 43…
## $ LONG              <dbl> -79.90074, -79.90476, -79.93953, -79.91941, -79.9204…
## $ DAUID             <dbl> 35250503, 35250675, 35250964, 35250669, 35250669, 35…
## $ mhi               <dbl> 3.3902, 4.5770, 6.3081, 5.4911, 5.4911, 9.8697, 6.30…
## $ dwell_den         <dbl> 941.3980, 1688.5725, 534.6675, 892.1744, 892.1744, 3…
## $ lum               <dbl> 0.805636, 0.280830, 0.455743, 0.479460, 0.479460, 0.…
## $ st_den            <dbl> 14.376206, 19.497536, 13.556608, 14.307826, 14.30782…
## $ inter_den         <dbl> 39.224916, 109.529025, 15.276213, 45.883253, 45.8832…
## $ SF_P_ratio        <dbl> 0.230931, 0.356169, 0.074477, 0.268249, 0.268249, 0.…
## $ side_den          <dbl> 22.633222, 39.640032, 8.228497, 37.457574, 37.457574…
## $ Shelters_SD       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Shelters_D        <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1…
## $ Shelters_A        <dbl> 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0…
## $ Shelters_SA       <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

Info. de las primeras 5 variables

# Descriptive statistics for the first five columns only.
mc_mode_choice[, 1:5] |> summary()
##   RespondentID           choice         avcycle           avwalk      
##  Min.   :566872636   Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:567814188   1st Qu.:2.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :568682048   Median :2.000   Median :0.0000   Median :1.0000  
##  Mean   :570566454   Mean   :2.618   Mean   :0.2747   Mean   :0.6613  
##  3rd Qu.:574925212   3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :587675235   Max.   :4.000   Max.   :1.0000   Max.   :1.0000  
##      avhsr       
##  Min.   :0.0000  
##  1st Qu.:1.0000  
##  Median :1.0000  
##  Mean   :0.9608  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

Para convertir la variable de elección (choice) en un factor hacemos…

# Recode the `choice` column as a factor. The labels are assigned
# positionally to the sorted distinct values found in the column.
mc_mode_choice[["choice"]] <- factor(
  mc_mode_choice[["choice"]],
  labels = c("Cycle", "Walk", "HSR", "Car")
)

Y se aplica summary

# For a factor, summary() reports the number of observations per level.
mc_mode_choice[["choice"]] |> summary()
## Cycle  Walk   HSR   Car 
##    48   711   336   281

Para resumir una variable…

# Five-number summary plus mean of the cycling time column.
mc_mode_choice[["timecycle"]] |> summary()
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.29      3.79      5.83  34014.86 100000.00 100000.00
# Keep only the two active-travel time columns and drop every row where
# either of them equals 100000.
# NOTE(review): 100000 looks like a placeholder for an unavailable time --
# confirm against the data dictionary.
time.Active.clean <- mc_mode_choice |>
  dplyr::select(timecycle, timewalk) |>
  filter(timecycle != 100000, timewalk != 100000)

Se obtiene un resumen estadístico:

# Descriptive statistics of the filtered cycling and walking times.
time.Active.clean |> summary()
##    timecycle          timewalk    
##  Min.   : 0.2914   Min.   : 1.00  
##  1st Qu.: 2.9141   1st Qu.:10.00  
##  Median : 4.3711   Median :15.00  
##  Mean   : 4.5852   Mean   :16.10  
##  3rd Qu.: 5.8282   3rd Qu.:20.00  
##  Max.   :17.4845   Max.   :62.11

Se grafica el resumen estadístico

# Overlaid binned-count areas (5-unit bins) of cycling and walking times.
# Fill is mapped to a constant label inside aes() so that ggplot draws a
# legend; with hard-coded fill colours the two areas could not be told apart.
ggplot(data = time.Active.clean) +
  geom_area(
    aes(x = timecycle, fill = "Bicicleta"),
    stat = "bin", binwidth = 5, color = "black", alpha = 0.6
  ) +
  geom_area(
    aes(x = timewalk, fill = "A pie"),
    stat = "bin", binwidth = 5, color = "black", alpha = 0.6
  ) +
  # Same colours as before, now tied to the legend entries.
  scale_fill_manual(
    values = c("Bicicleta" = "blue", "A pie" = "yellow"),
    name = "Modo"
  ) +
  xlab("Tiempo (minutos)")

Resumen con 2 variables

# Joint summary of one categorical (choice) and one numeric (side_den) column.
summary(mc_mode_choice[, c("choice", "side_den")])
##    choice       side_den    
##  Cycle: 48   Min.   : 0.00  
##  Walk :711   1st Qu.:18.19  
##  HSR  :336   Median :22.63  
##  Car  :281   Mean   :24.18  
##              3rd Qu.:35.70  
##              Max.   :59.41

Se traza la variable categórica

# Boxplot of side_den for each level of the choice factor.
ggplot(mc_mode_choice, aes(x = choice, y = side_den)) +
  geom_boxplot()

Ejercicio

Invoke data set Mode from package mlogit. To do this you need to first load the package. This is a data set with choices about mode of transportation. This is done as follows:

# Load the Mode data set. Naming the package explicitly means the call does
# not depend on mlogit already being attached to the search path.
data("Mode", package = "mlogit")

Once you have loaded the data set, answer the following questions: Describe this data set. How many variables are there and of which type (i.e., categorical/quantitative)?

# Column names, types and first values of the Mode data set.
Mode |> glimpse()
## Rows: 453
## Columns: 9
## $ choice       <fct> car, rail, car, car, car, car, car, car, bus, car, rail, …
## $ cost.car     <dbl> 1.5070097, 6.0569985, 5.7946769, 1.8691439, 2.4989523, 4.…
## $ cost.carpool <dbl> 2.3356118, 2.8969191, 2.1374543, 2.5724266, 1.7220099, 0.…
## $ cost.bus     <dbl> 1.800512, 2.237128, 2.576385, 1.903518, 2.686000, 1.84765…
## $ cost.rail    <dbl> 2.358920, 1.855450, 2.747479, 2.268276, 2.973866, 2.31005…
## $ time.car     <dbl> 18.503200, 31.311107, 22.547429, 26.090282, 4.699140, 3.0…
## $ time.carpool <dbl> 26.338233, 34.256956, 23.255171, 29.896023, 12.414084, 9.…
## $ time.bus     <dbl> 20.86779, 67.18189, 63.30906, 19.75270, 43.09204, 12.8256…
## $ time.rail    <dbl> 30.03347, 60.29313, 49.17164, 13.47268, 39.74325, 43.5442…

How many different modes of transportation are in this data set? What is the most popular mode? What is the least popular mode?

# Counts per chosen mode plus descriptive statistics of every cost/time column.
Mode |> summary()
##      choice       cost.car       cost.carpool       cost.bus    
##  car    :218   Min.   :0.4099   Min.   :0.1293   Min.   :1.013  
##  carpool: 32   1st Qu.:3.6964   1st Qu.:0.9519   1st Qu.:1.783  
##  bus    : 81   Median :4.8796   Median :1.6665   Median :2.027  
##  rail   :122   Mean   :4.8735   Mean   :1.6863   Mean   :2.036  
##                3rd Qu.:6.2255   3rd Qu.:2.4581   3rd Qu.:2.321  
##                Max.   :8.8555   Max.   :3.2953   Max.   :2.740  
##    cost.rail        time.car       time.carpool       time.bus     
##  Min.   :1.272   Min.   : 2.404   Min.   : 8.385   Min.   : 1.969  
##  1st Qu.:1.947   1st Qu.:21.835   1st Qu.:28.391   1st Qu.:25.457  
##  Median :2.198   Median :37.497   Median :40.637   Median :41.415  
##  Mean   :2.212   Mean   :37.044   Mean   :39.771   Mean   :39.923  
##  3rd Qu.:2.476   3rd Qu.:53.104   3rd Qu.:51.843   3rd Qu.:52.805  
##  Max.   :3.113   Max.   :66.871   Max.   :65.009   Max.   :75.681  
##    time.rail     
##  Min.   : 4.621  
##  1st Qu.:28.143  
##  Median :40.034  
##  Mean   :39.505  
##  3rd Qu.:49.172  
##  Max.   :73.998
# Bar chart with the number of observations per chosen mode. Bars are
# coloured by mode, so the legend would be redundant and is hidden.
ggplot(Mode, aes(x = choice, fill = choice)) +
  geom_bar(color = "black", show.legend = FALSE) +
  theme_bw()

In general, what is the most expensive mode? The least expensive?

# Overlaid density curves of the cost of each mode. Each layer maps fill to
# a constant label so that a single manual scale can colour and name them.
ggplot(data = Mode) +
  geom_density(mapping = aes(x = cost.car, fill = "car"),
               color = "black", alpha = 0.5) +
  geom_density(mapping = aes(x = cost.carpool, fill = "carpool"),
               color = "black", alpha = 0.5) +
  geom_density(mapping = aes(x = cost.bus, fill = "bus"),
               color = "black", alpha = 0.5) +
  geom_density(mapping = aes(x = cost.rail, fill = "rail"),
               color = "black", alpha = 0.5) +
  scale_fill_manual(
    name = "Mode",
    values = c(
      "car" = "firebrick",
      "carpool" = "dodgerblue",
      "bus" = "darkgoldenrod2",
      "rail" = "cyan"
    )
  ) +
  labs(x = "Cost") +
  theme_bw()

Create a plot showing the univariate distributions of time by car and time by bus. Discuss.

# Overlaid density curves of travel time for the four modes, stored so the
# same plot can be handed to plotly below.
grafica_time <- ggplot(data = Mode) +
  geom_density(mapping = aes(x = time.car, fill = "car"),
               color = "black", alpha = 0.5) +
  geom_density(mapping = aes(x = time.carpool, fill = "carpool"),
               color = "black", alpha = 0.5) +
  geom_density(mapping = aes(x = time.bus, fill = "bus"),
               color = "black", alpha = 0.5) +
  geom_density(mapping = aes(x = time.rail, fill = "rail"),
               color = "black", alpha = 0.5) +
  scale_fill_manual(
    name = "Mode",
    values = c(
      "car" = "firebrick",
      "carpool" = "dodgerblue",
      "bus" = "darkgoldenrod2",
      "rail" = "cyan"
    )
  ) +
  labs(x = "Time") +
  theme_bw()

# Convert the static ggplot into an interactive plotly widget.
grafica_time |> ggplotly()

How do choices relate to cost by the different modes?

# Names of every numeric column in Mode.
varnum <- Mode |>
  dplyr::select(where(is.numeric)) |>
  names()

# Draw one boxplot per numeric column, split by the chosen mode.
for (var in varnum) {
  # `.data[[var]]` is the supported way to refer to a column by its name
  # held in a string inside aes(); it replaces eval(as.name(var)).
  grafica_box <- ggplot(Mode) +
    geom_boxplot(aes(choice, .data[[var]], fill = choice), show.legend = FALSE) +
    ylab(var) +
    theme_bw()

  # Plots are not auto-printed inside a loop, so print explicitly.
  print(grafica_box)
}

Ejercicios:

4.- Describe los datos del ejercicio. ¿Cuántas variables hay y de qué tipo (categóricas o cuantitativas)?

Una variable categórica y 8 cuantitativas en el data set Mode

5.- ¿Cuántos modos diferentes de transportación hay en este ejercicio? ¿Cuál es el más popular? ¿Cuál es el menos popular?

Hay cuatro: ° Auto (car, el más popular: 218) ° Auto compartido (carpool, el menos popular: 32) ° Autobús (bus: 81) ° Tren (rail: 122)

6.- En general, ¿cuál es el modo más costoso de transporte y cuál el menos costoso?

El auto es el más costoso y el menos costoso es el auto compartido

7.- Crea un plot mostrando la distribución univariable del tiempo en el auto y en el autobús.

# Overlaid density curves of travel time for car and bus only, stored so the
# same plot can be handed to plotly below.
grafica_time <- ggplot(data = Mode) +
  geom_density(mapping = aes(x = time.car, fill = "car"),
               color = "black", alpha = 0.5) +
  geom_density(mapping = aes(x = time.bus, fill = "bus"),
               color = "black", alpha = 0.5) +
  scale_fill_manual(
    name = "Mode",
    values = c(
      "car" = "firebrick",
      "carpool" = "dodgerblue",
      "bus" = "darkgoldenrod2",
      "rail" = "cyan"
    )
  ) +
  labs(x = "Time") +
  theme_bw()

# Convert the static ggplot into an interactive plotly widget.
grafica_time |> ggplotly()

En términos generales, el auto es más eficiente en distancias cortas; sin embargo, el autobús se vuelve más eficiente en trayectos superiores a 45 min (aprox.).

8.- ¿Cómo se relacionan las preferencias con el costo en los diferentes modos de transporte?

# Names of every numeric column in Mode.
varnum <- Mode |>
  dplyr::select(where(is.numeric)) |>
  names()

# Draw one boxplot per numeric column, split by the chosen mode.
for (var in varnum) {
  # `.data[[var]]` is the supported way to refer to a column by its name
  # held in a string inside aes(); it replaces eval(as.name(var)).
  grafica_box <- ggplot(Mode) +
    geom_boxplot(aes(choice, .data[[var]], fill = choice), show.legend = FALSE) +
    ylab(var) +
    theme_bw()

  # Plots are not auto-printed inside a loop, so print explicitly.
  print(grafica_box)
}